import os
import gym
import torch
import pygame
import numpy as np

from agent import Agent
from model import Model
from algorithm import DQN
from replay_memory import ReplayMemory
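# The local modules above are assumed to provide the interfaces used below:
# Model(input_size, output_size) builds the Q-network, DQN(model, act_dim,
# gamma, lr, device) implements the learning update, Agent wraps epsilon-greedy
# sampling (sample) and greedy action selection (predict), and ReplayMemory
# offers append / sample / __len__.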

# clock used to cap the frame rate when rendering
clock = pygame.time.Clock()

LEARN_FREQ = 5  # learning frequency: no need to learn at every step; accumulate some new experience before each learn call to improve efficiency
MEMORY_SIZE = 20000  # size of the replay memory; larger values use more RAM
MEMORY_WARMUP_SIZE = 200  # pre-fill the replay memory with some experience before sampling batches for the agent to learn from
BATCH_SIZE = 32  # number of transitions per learn call, sampled at random from the replay memory
LEARNING_RATE = 0.001  # learning rate
GAMMA = 0.99  # reward discount factor, typically between 0.9 and 0.999
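
# For reference, a minimal replay-buffer sketch matching the interface this
# script relies on (append / sample / __len__). The real ReplayMemory is
# imported from replay_memory above; this illustrative class is not used below.
import random
from collections import deque


class _ReplayMemorySketch:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)  # oldest transitions are evicted automatically

    def append(self, exp):
        # exp is an (obs, action, reward, next_obs, done) tuple
        self.buffer.append(exp)

    def sample(self, batch_size):
        # uniform random sampling breaks the temporal correlation between transitions
        batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*batch)
        return (np.array(obs, dtype='float32'), np.array(action, dtype='int64'),
                np.array(reward, dtype='float32'), np.array(next_obs, dtype='float32'),
                np.array(done, dtype='float32'))

    def __len__(self):
        return len(self.buffer)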


# Run one training episode
def run_episode(env, agent, rpm):
    # rpm: ReplayMemory, the experience replay buffer
    total_reward = 0
    obs = env.reset()
    step = 0
    while True:
        step += 1
        action = agent.sample(obs)  # sample an action; every action has some probability of being tried

        next_obs, reward, done, _ = env.step(action)
        # print(step, action, next_obs, reward, done)
        # add the transition to the replay buffer
        rpm.append((obs, action, reward, next_obs, done))

        # train model
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs, batch_done) = rpm.sample(BATCH_SIZE)
            # s, a, r, s', done
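            # learn() regresses Q(s, a) toward the one-step TD target
            # r + GAMMA * max_a' Q(s', a') * (1 - done); see the update sketch
            # at the bottom of this file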
            train_loss = agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, batch_done)

        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward


# Evaluate the agent: run 5 episodes and average the total rewards
def evaluate(env, agent, render=False):
    eval_reward = []
    for _ in range(5):
        obs = env.reset()
        episode_reward = 0
        while True:
            action = agent.predict(obs)  # predict the action, always choosing the greedy (best) one
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                # cap the frame rate
                clock.tick(50)
                env.render()
                # drain the event queue so the pygame window stays responsive
                for event in pygame.event.get():
                    pass
            if done:
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)


def main():
    device = gpu_setup(True, 0)  # use GPU 0 if a GPU is available
    env = gym.make('CartPole-v1')
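    # note: this script targets the classic gym API (reset() returns obs and
    # step() returns a 4-tuple), i.e. gym < 0.26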
    action_dim = env.action_space.n  # CartPole-v1: 2
    obs_shape = env.observation_space.shape  # CartPole-v1: (4,)

    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # build the agent
    model = Model(input_size=obs_shape[0], output_size=action_dim)
    algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE, device=device)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # probability of picking a random action, for exploration
        e_greed_decrement=1e-6,  # gradually reduce exploration as training converges
        device=device
    )
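
    # A sketch of the epsilon-greedy rule agent.sample() presumably follows
    # (illustrative; the actual logic lives in agent.py):
    #   with probability e_greed: take a uniformly random action (explore)
    #   otherwise:                take argmax_a Q(obs, a)         (exploit)
    #   then decrease e_greed by e_greed_decrement, so exploration decays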

    # load a previously saved model (optional)
    # save_path = './dqn_model.pkl'
    # agent.model.load_state_dict(torch.load(save_path))

    # warm up the replay buffer first so early training batches are diverse enough
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation episodes do not count toward the total
        # train part
        for _ in range(50):
            run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward = evaluate(env, agent, render=True)  # render=True to watch the agent play
        print('episode:{}    e_greed:{}   Test reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # training finished; save the model (optional)
    # save_path = './dqn_model.pkl'
    # torch.save(agent.model.state_dict(), save_path)

    env.close()


# GPU setup
def gpu_setup(use_gpu, gpu_id):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    if torch.cuda.is_available() and use_gpu:
        print('cuda available with GPU:', torch.cuda.get_device_name(0))
        device = torch.device("cuda")
    else:
        print('cuda not available')
        device = torch.device("cpu")
    return device
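

# For reference only: a minimal sketch of the DQN update that algorithm.learn()
# is assumed to perform (the actual implementation lives in algorithm.py;
# 'target_model' denotes the periodically synced target network of standard DQN)
def _dqn_update_sketch(model, target_model, optimizer, batch, gamma, device):
    obs, action, reward, next_obs, done = [
        torch.as_tensor(np.asarray(x), device=device) for x in batch]
    # Q(s, a) for the actions actually taken
    q = model(obs).gather(1, action.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # one-step TD target: r + gamma * max_a' Q_target(s', a'), zeroed at terminal states
        target = reward + gamma * target_model(next_obs).max(1).values * (1 - done)
    loss = torch.nn.functional.mse_loss(q, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()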


if __name__ == '__main__':
    main()
